Population and samples in statistics¶

Libraries and settings¶

In [1]:
# Libraries
import os
import numpy as np
import random
import statistics
import matplotlib.pyplot as plt
from IPython.display import Image

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Show current working directory
print(os.getcwd())
/Users/darioholenstein/Documents/zhaw/data-analytics/data_analytics/Week_06

Chickens on a chicken farm (simulated data)¶

In [14]:
Image("electric_cars.png", width=800)
Out[14]:
In [ ]:
# Parameters of distribution
p_mean = 320
p_std = 80
p_n = 2500

# Setting a seed to restore the random numbers later
np.random.seed(42)

# Generate (normally distributed) data with mean = 320km and standard deviation = 80km
pop = np.random.normal(loc=p_mean, scale=p_std, size=p_n)

print(f'{len(pop):.0f}')
print(f'{np.mean(pop):.4f}')
print(f'{np.std(pop):.4f}')

# Plot histogram based on the data
plt.hist(pop, bins = 50, color='gold')
plt.title('Electric range WLTP (simulated data)')
plt.xlabel('Range (km)')
plt.ylabel('Cars')

plt.show()
2500
322.7011
78.6103

Single random sample from the population¶

In [26]:
# Sample size
sample_size = 100

# Setting a seed to restore the random numbers later
random.seed(42)

# Random sample
random_sample = random.sample(list(pop), sample_size)

# Statistics of the sample
print(f'Mean: {statistics.mean(random_sample):.4f}')
print(f'Standard deviation: {statistics.stdev(random_sample):.4f}')

# Plot histogram based on the sample
plt.hist(random_sample, bins = 50, color='deepskyblue')
plt.title(f'Electric range WLTP (single sample with n={sample_size})', fontsize=10)
plt.xlabel('Range (km)')
plt.ylabel('Cars')

plt.show()
Mean: 323.5502
Standard deviation: 75.5315

Repeated random sampling 'with replacement' (bootstrapping)¶

In [30]:
def bootstrap(p_mean= 320, p_std= 80, num_iter = 5000, sample_size = 200):
    """
    Creates bootstrap sample from a distribution.

    :param p_mean population mean. Type = int, float
    :param p_std. Type = int, float
    :param p_n. Type = int, float
    :param num_iterations: number of iterations. Type = int
    :param sample_size: size of single bootstrap sample. Type = int
    :return: plot of bootstrap sample.
    """

    # Setting a seed to restore the random numbers later
    random.seed(42)

    # Generate (normally distributed) data
    pop = np.random.normal(loc=p_mean, scale=p_std, size=p_n)

    # Create empty lists to save results of iterations
    sample_mean = []
    sample_std  = []

    # Loop for iterations
    for i in range(num_iter):
        samp = random.sample(pop.tolist(), sample_size)
        avg = np.mean(samp)
        std = np.std(samp)
        sample_mean.append(avg)
        sample_std.append(std)

    # Print mean and sd of bootsstapped distribution
    print(f'{np.mean(sample_mean):.4f}')
    print(f'{np.mean(sample_std):.4f}')

    # Plot histogram based on the bootstrapped data
    plt.hist(sample_mean, bins = 50, color='mediumorchid')
    plt.title('Electric range WLTP (bootstrap sample)', fontsize=10)
    plt.xlabel('Range (km)')
    plt.ylabel('Cars')
    plt.show()

# Function call
bootstrap(p_mean=320, p_std=80, num_iter = 1000, sample_size = 1000)
318.8035
78.6710

Jupyter notebook --footer info-- (please always provide this at the end of each notebook)¶

In [6]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')
-----------------------------------
POSIX
Darwin | 24.0.0
Datetime: 2024-11-06 10:57:02
Python Version: 3.11.5
-----------------------------------